#' ---
#' title: Reproduce Figure 3 (Manifesto Ideology)
#' date: 2024-10-01
#' version: 1.0
#' ---

library(tidyverse)
library(patchwork)
library(ggrepel)

## Load and Clean Benoit et al. (2016) Data ----------------------------

# Read the manifesto-level estimates; the columns used below are
# manifestoid, expert_social, expert_economic, crowd_social, crowd_economic.
manifestos <- read_csv('application3-benoit-manifesto-estimates.csv')

# pivot longer: one row for each (manifesto x policy area) combination
# Expert estimates -> columns manifestoid, policy, expert_position
expert_manifestos <- manifestos |> 
  select(manifestoid, 
         Social = expert_social,
         Economic = expert_economic) |> 
  pivot_longer(cols = c('Social', 'Economic'),
               names_to = 'policy',
               values_to = 'expert_position')

# Crowd estimates -> columns manifestoid, policy, crowd_position
crowd_manifestos <- manifestos |> 
  select(manifestoid, 
         Social = crowd_social,
         Economic = crowd_economic) |> 
  pivot_longer(cols = c('Social', 'Economic'),
               names_to = 'policy',
               values_to = 'crowd_position')

# Combine both sets of estimates. The join keys are spelled out so the join
# does not silently depend on whichever column names the two tables happen
# to share (these are the only two shared columns, so the result is the same
# as the natural join).
manifestos <- left_join(expert_manifestos, crowd_manifestos,
                        by = c('manifestoid', 'policy'))


## Plot Crowd-Coded Positions ---------------------

# Crowd-coded position (x) against expert position (y), one panel per policy
# area, with every manifesto drawn as its repelled text label.
p1 <- ggplot(manifestos,
             aes(x = crowd_position,
                 y = expert_position,
                 label = manifestoid)) +
  geom_text_repel() +
  facet_wrap(vars(policy)) +
  labs(x = 'Crowd-Coded Position',
       y = 'Expert Position') +
  theme_bw()

# Write the figure to disk (8 x 5 in the default ggsave units).
ggsave(filename = 'fig3a.png',
       plot = p1,
       width = 8,
       height = 5)

## Load and Clean Sentence-Level GPT-3 Labels ---------------------

# Sentence-level GPT-3 output: one file with the ideology labels and one
# with the policy-area labels.
ideology <- read_csv('application3-one-shot-gpt-3-ideology.csv')

policy <- read_csv('application3-one-shot-gpt-3-policy.csv')

# get the policy labels and probabilities
# Long table of candidate labels: one row per (sentence x token column),
# built from token1..token5.
policy_labels <- policy |>  
  select(sentenceid, token1:token5) |>  
  pivot_longer(cols = token1:token5,
               values_to = 'policy') |>  
  select(-name)

# Matching long table built from prob1..prob5.
policy_probabilities <- policy |> 
  select(sentenceid, prob1:prob5) |> 
  pivot_longer(cols = prob1:prob5,
               values_to = 'probability')

# The probabilities are attached by row position, so verify that both long
# tables list the sentences in exactly the same order before combining them.
stopifnot(identical(policy_labels$sentenceid,
                    policy_probabilities$sentenceid))
policy_labels$prob <- policy_probabilities$probability

# remove whitespace and capitalize. Replace blank with Neither
policy_labels <- policy_labels |>  
  mutate(policy = str_trim(policy)) |>  
  mutate(policy = str_to_title(policy)) |> 
  mutate(policy = if_else(policy == '', 'Neither', policy))

# sum the probabilities and keep the highest for each sentenceid
# NOTE(review): slice_max() keeps ties by default (with_ties = TRUE), so a
# sentence whose top labels have exactly equal confidence yields more than
# one row here and is duplicated by the join below. Left unchanged so the
# numbers are not altered; confirm whether ties occur in the data.
policy_labels <- policy_labels |> 
  group_by(sentenceid, policy) |> 
  summarize(policy_confidence = sum(prob), .groups = 'drop') |> 
  group_by(sentenceid) |> 
  slice_max(policy_confidence, n = 1) |> 
  ungroup()

# Attach the winning policy label and its confidence to each sentence.
# any_of() drops the unnamed index column `...1` when it is present and does
# not fail when the CSV was read without it.
gpt3_sentences <- policy |> 
  select(-any_of('...1')) |> 
  left_join(policy_labels, by = 'sentenceid')


# Build, for every sentence, the probability mass on Conservative, Liberal
# and Neither.

# Long table of candidate labels (token1..token5), one row per token.
ideology_labels <- pivot_longer(
  select(ideology, sentenceid, token1:token5),
  cols = token1:token5,
  values_to = 'ideology'
)
ideology_labels <- select(ideology_labels, -name)

# Long table of the corresponding probabilities (prob1..prob5).
ideology_probabilities <- pivot_longer(
  select(ideology, sentenceid, prob1:prob5),
  cols = prob1:prob5,
  values_to = 'probability'
)

# Both tables come from the same rows in the same order, so the
# probabilities are attached by position.
ideology_labels$prob <- ideology_probabilities$probability

# Tidy the labels: trim whitespace, title-case, then collapse synonyms.
# Blank -> Neither; Conservatives/Right -> Conservative; Labour/Left -> Liberal;
# anything else is kept as is.
ideology_labels <- ideology_labels |>
  mutate(
    ideology = str_to_title(str_trim(ideology)),
    ideology = case_when(
      ideology == '' ~ 'Neither',
      ideology %in% c('Conservatives', 'Right') ~ 'Conservative',
      ideology %in% c('Labour', 'Left') ~ 'Liberal',
      TRUE ~ ideology
    )
  )

# Add up the probability per (sentence, label), keep only the three labels
# of interest, and spread them into one column each (0 where a label is absent).
ideology_labels <- ideology_labels |>
  group_by(sentenceid, ideology) |>
  summarize(prob = sum(prob)) |>
  ungroup() |>
  filter(ideology %in% c('Conservative', 'Neither', 'Liberal')) |>
  pivot_wider(names_from = 'ideology',
              values_from = 'prob',
              values_fill = 0)


# merge with dataset
# Sentences with no row in ideology_labels get NA in the Conservative,
# Liberal and Neither columns.
gpt3_sentences <- gpt3_sentences |> 
  left_join(ideology_labels, by = 'sentenceid')

# normalize probabilities to sum to 1
gpt3_sentences <- gpt3_sentences |> 
  mutate(p_sum = Conservative + Liberal + Neither) |> 
  mutate(Conservative = Conservative / p_sum,
         Liberal = Liberal / p_sum,
         Neither = Neither / p_sum) |> 
  select(-p_sum) |> 
  # ideology measure is P(Conservative) - P(Liberal)
  mutate(gpt3_ideology = Conservative - Liberal)

# Aggregate to one position per (manifesto x policy area), using only the
# sentences labelled Economic or Social.
gpt3_manifestos <- gpt3_sentences |> 
  filter(policy %in% c('Economic', 'Social')) |> 
  select(manifestoid, policy, gpt3_ideology) |> 
  group_by(manifestoid, policy) |> 
  # na.rm = TRUE: a sentence without a usable ideology score (NA from the
  # join above, or NaN from a zero p_sum) must not turn the whole manifesto
  # position into NA. num_sentences still counts every sentence in the group.
  # .groups = 'drop' returns an ungrouped table.
  summarize(gpt3_position = mean(gpt3_ideology, na.rm = TRUE),
            num_sentences = n(),
            .groups = 'drop')

## Plot GPT-3 Coded Positions ---------------------

# Add the GPT-3 positions (gpt3_position, num_sentences) to the expert/crowd
# table. The keys are named explicitly rather than inferred from the shared
# column names; manifestoid and policy are the only columns the two tables
# have in common, so the result matches the natural join.
manifestos <- left_join(manifestos, gpt3_manifestos,
                        by = c('manifestoid', 'policy'))

# GPT-3 position (x) against expert position (y), one panel per policy area,
# with every manifesto drawn as its repelled text label.
p2 <- ggplot(manifestos,
             aes(x = gpt3_position,
                 y = expert_position,
                 label = manifestoid)) +
  geom_text_repel() +
  facet_wrap(vars(policy)) +
  labs(x = 'GPT-3 Position',
       y = 'Expert Position') +
  theme_bw()

# Write the figure to disk (8 x 5 in the default ggsave units).
ggsave(filename = 'fig3b.png',
       plot = p2,
       width = 8,
       height = 5)


